########################################################
# Project:    Commission Communication
# Task:       Extract language-based indicators from 
#             comparison corpora
# Author:     Christian Rauh (12.01.2021)
########################################################

# Packages ####
library(tidyverse) # 1.3.0
library(quanteda) # 3.2.0
library(sophistication) # 0.70 - https://github.com/kbenoit/sophistication
library(spacyr) # 1.2.1
library(textcat) # 1.0-7

# Other tools ####

# Edited functions from the sophistication package
# that exclude stopwords when calculating term familiarity
source("./Tools/covars_make_baselines_CR.R")



# Political Science Abstracts ####

start <- Sys.time() # Start of the timer for this section

# Clean version of the PolSci corpus
# The abstract column is renamed to the generic "text" column used below
corpus <- read_rds("./Corpora/PolSciAbstracts.rds") %>% 
  rename(text = abstract) %>% 
  mutate(year = 2010) # Highest in Google ngram corp, data from 2010-2021

# Sample for testing
# corpus <- corpus %>% sample_n(100, replace = FALSE)

# Identifier variable
# seq_len() instead of 1:nrow(): gives an empty sequence (not c(1, 0))
# should the corpus ever be empty
corpus$cid <- seq_len(nrow(corpus))

# Quanteda corpus object, carrying cid and year as document variables
# (the data frame named "corpus" does not mask quanteda's corpus() here:
# R skips non-function objects when resolving a function call)
qcorp <- corpus(corpus$text, docvars = data.frame(corpus[, c("cid", "year")]))

# Lookup table linking quanteda's document ids to the document variables (cid, year)
docids <- docvars(qcorp) %>% 
  mutate(doc_id = as.character(docid(qcorp)))


# Readability: Flesch reading ease scores via the sophistication package
# doc_id is rebuilt as "text" + row name; presumably this mirrors quanteda's
# default document names (text1, text2, ...) - verify if the corpus gets named docs
re <- covars_make(qcorp, readability_measure = "Flesch")
re[["doc_id"]] <- paste0("text", row.names(re))

# Term familiarity based on Google N-Gram baselines
# Uses the modified sophistication function sourced at the top of this script;
# its row names are taken over as doc_id as they are
fam <- covars_make_baselines_CR(qcorp, baseline_year = docvars(qcorp, "year"))
fam[["doc_id"]] <- row.names(fam)

# Part-of-speech distributions (sophistication/spacyr)
# Author's note: the result brings its own doc_id column and its row order
# is not necessarily that of the corpus
pos <- covars_make_pos(qcorp)

# Combine data

# Join the per-document indicator tables on doc_id, one after the other
# (merge() keeps only documents present in every table); the last table
# attaches cid so the indicators can be linked back to the texts
indicators <- Reduce(
  function(left, right) merge(left, right, by = "doc_id"),
  list(
    re[, c("doc_id", "meanSentenceLength", "Flesch")],
    fam[, c("doc_id","google_mean_local")],
    pos[, c("doc_id","n_namedentities", "n_noun", "n_verb", "n_sentence", "ntoken")],
    docids[ ,c("doc_id", "cid")]
  )
) %>% 
  rename(flesch = Flesch,
         familiarity = google_mean_local)

# Attach the indicators to the texts, add the noun-to-verb ratio,
# and drop the helper ids as well as the url column
corpus <- merge(corpus, indicators, by = "cid") %>% 
  mutate(nominal = n_noun / n_verb) %>% 
  select(-cid, -doc_id, -url)


# Export the PolSci texts together with their language indicators
write_rds(corpus, "./Data/Comp-PolSci_Language.Rds")

# Elapsed time for the PolSci section
end <- Sys.time()
duration1 <- difftime(end, start)
duration1



# Newspaper texts (BNC) ####

start <- Sys.time() # Start of the timer for this section

# Clean version of the BNC corpus
# Keep news texts from broadsheet or tabloid categories only
# and label each text by its outlet type
bnc <- read_rds("./Corpora/BNC_RawTexts.rds") %>% 
  filter(type == "NEWS",
         str_detect(category, "brdsht|tabloid")) %>% 
  mutate(newspaper = ifelse(str_detect(category, "brdsht"), "Broadsheet", "Tabloid"))

# Smaller chunks
# BNC contains samples of newspapers chunked in big blocks
# Separate them into the original paragraphs here (split at line breaks)
#
# The pieces are collected in a pre-allocated list and bound once at the end;
# growing the data frame with rbind() inside the loop would re-copy all
# previously collected rows on every iteration.

paragraphs <- vector("list", nrow(bnc))

for (i in seq_len(nrow(bnc))) { # seq_len(): no iterations if bnc is empty
  print(i) # Progress indicator
  paragraphs[[i]] <- data.frame(text = str_split(bnc$text[i], "\n")[[1]],
                                newspaper = bnc$newspaper[i], # recycled to all paragraphs
                                stringsAsFactors = FALSE)
}

# The leading empty template guarantees the text/newspaper columns
# (and a plain data.frame result) even if there is nothing to bind
corpus <- bind_rows(data.frame(text = character(0),
                               newspaper = character(0),
                               stringsAsFactors = FALSE),
                    paragraphs)

rm(bnc, paragraphs)

# Clean up a little
# Insert the missing whitespace after a punctuation mark that sits directly
# between a lower-case and an upper-case letter (e.g. "end.Next" -> "end. Next")
corpus$text <- corpus$text %>% 
  str_replace_all("([a-z])([[:punct:]])([A-Z])", "\\1\\2 \\3")

corpus$year <- 1994 # Publication year of corpus

# Sample for testing
# corpus <- corpus %>% sample_n(100, replace = FALSE)

# Identifier variable
# seq_len() instead of 1:nrow(): gives an empty sequence (not c(1, 0))
# should the corpus ever be empty
corpus$cid <- seq_len(nrow(corpus))

# Quanteda corpus object, carrying cid and year as document variables
# (the data frame named "corpus" does not mask quanteda's corpus() here:
# R skips non-function objects when resolving a function call)
qcorp <- corpus(corpus$text, docvars = data.frame(corpus[, c("cid", "year")]))

# Lookup table linking quanteda's document ids to the document variables (cid, year)
docids <- docvars(qcorp) %>% 
  mutate(doc_id = as.character(docid(qcorp)))


# Readability: Flesch reading ease scores via the sophistication package
# doc_id is rebuilt as "text" + row name; presumably this mirrors quanteda's
# default document names (text1, text2, ...) - verify if the corpus gets named docs
re <- covars_make(qcorp, readability_measure = "Flesch")
re[["doc_id"]] <- paste0("text", row.names(re))

# Term familiarity based on Google N-Gram baselines
# Uses the modified sophistication function sourced at the top of this script;
# its row names are taken over as doc_id as they are
fam <- covars_make_baselines_CR(qcorp, baseline_year = docvars(qcorp, "year"))
fam[["doc_id"]] <- row.names(fam)

# Part-of-speech distributions (sophistication/spacyr)
# Author's note: the result brings its own doc_id column and its row order
# is not necessarily that of the corpus
pos <- covars_make_pos(qcorp)

# Combine data

# Join the per-document indicator tables on doc_id, one after the other
# (merge() keeps only documents present in every table); the last table
# attaches cid so the indicators can be linked back to the texts
indicators <- Reduce(
  function(left, right) merge(left, right, by = "doc_id"),
  list(
    re[, c("doc_id", "meanSentenceLength", "Flesch")],
    fam[, c("doc_id","google_mean_local")],
    pos[, c("doc_id","n_namedentities", "n_noun", "n_verb", "n_sentence", "ntoken")],
    docids[ ,c("doc_id", "cid")]
  )
) %>% 
  rename(flesch = Flesch,
         familiarity = google_mean_local)

# Attach the indicators to the paragraphs and add the noun-to-verb ratio
# NOTE(review): only cid is dropped here, so doc_id stays in the exported
# data (the PolSci section drops it) - confirm this is intended
corpus <- merge(corpus, indicators, by = "cid") %>% 
  mutate(nominal = n_noun / n_verb) %>% 
  select(-cid)


# Export the BNC newspaper paragraphs together with their language indicators
write_rds(corpus, "./Data/Comp-NewsBNC_Language.Rds")

# Elapsed time for the BNC section
end <- Sys.time()
duration2 <- difftime(end, start)
duration2

